package com.fangyuan.anjuke;

import com.fangyuan.model.Fangyuan;
import org.assertj.core.util.Lists;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.annotation.HelpUrl;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.processor.example.GithubRepoPageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.util.ArrayList;
import java.util.List;

public class AnJuKePageProcessor implements PageProcessor {
    static String  startUrl = "https://sh.zu.anjuke.com/fangyuan/hongkou/";
    private Site site = Site.me().setCharset("UTF-8")
            .addHeader("Connection", "keep-alive")
            .addHeader("Cache-Control", "max-age=0")
            .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0").setRetryTimes(3).setSleepTime(100);

    public void process(Page page) {
        List<Fangyuan> fangyuans=Lists.newArrayList();
        List<Selectable> nodes = page.getHtml().xpath("//*[@id=\"list-content\"]").$("div[class='zu-itemmod']").nodes();
        for (Selectable selectable:
             nodes) {
            Fangyuan fangyuan = new Fangyuan();
            fangyuan.setContactPhoneNumber(selectable.xpath("a").links().get());
            List<String> all = selectable.$("div[class='zu-info']").$("p[class='details-item bot-tag']").xpath("span/text()").all();
            fangyuan.setCommunity(selectable.$("div[class='zu-info']").xpath("address[@class='details-item']/a/text()").toString());
            fangyuan.setAddress(selectable.$("div[class='zu-info']").xpath("address[@class='details-item']/text()").toString());
            fangyuan.setDoorModel(selectable.$("div[class='zu-info']").xpath("p[@class='details-item tag']/text()").regex(".室.厅").toString());
            fangyuan.setFloor( selectable.$("div[class='zu-info']").xpath("p[@class='details-item tag']/text()").regex(".层").toString());
            fangyuan.setTheHeight(selectable.$("div[class='zu-info']").xpath("p[@class='details-item tag']/text()").regex("\\(共[0-9]{2,}层\\)").toString());
            fangyuan.setSquare(selectable.$("div[class='zu-info']").xpath("p[@class='details-item tag']/text()").regex("[0-9]{2,}").toString());
            fangyuan.setTheSubway(selectable.$("div[class='zu-info']").$("p[class='details-item bot-tag']").xpath("span[@class='cls-4']/text()").toString());
            fangyuan.setWhetherTheWholeRent(selectable.$("div[class='zu-info']").$("p[class='details-item bot-tag']").xpath("span[@class='cls-1']/text()").toString());
            fangyuan.setToward(selectable.$("div[class='zu-info']").$("p[class='details-item bot-tag']").xpath("span[@class='cls-2']/text()").toString());
            fangyuan.setSource("安居客");
            fangyuan.setTheContact(selectable.$("div[class='zu-info']").xpath("p[@class='details-item tag']").regex("</i>.*").toString());
            fangyuans.add(fangyuan);
        }
        page.putField("fangyuans",fangyuans);
        // 部分三：从页面发现后续的url地址来抓取
        page.addTargetRequests(page.getHtml().xpath("/html/body/div[5]/div[3]/div[3]/div/a[4]").links().all());
    }

    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        Spider.create(new AnJuKePageProcessor()).addPipeline(new AnJuKePipeline()).addUrl(startUrl).thread(5).run();
    }
}
